Table of Contents
# Mount Google Drive so the dataset Excel files under /content/drive are readable.
# NOTE: Colab-only; prompts for authorization on first run.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import random
import os
Regression analysis is the process of finding the function $f(x)$ that outputs the most similar value $\hat{y}$ to the dependent variable $y$ corresponding to the independent variable $x$.
$$ \hat{y} = f(x) \approx y $$
If $f(x)$ is a linear function, then this function is called a linear regression model.
$$ \hat{y} = \omega_0 + \omega_1x_1 + \omega_2x_2 + ... + \omega_Dx_D = \omega_0 + \omega^Tx$$In the above equation, the independent variable $x = (x_1, x_2, ... , x_D)$ is a $D$-dimensional vector. The weight vector $\omega = (\omega_0, ... , \omega_D)$ holds the coefficients of the function $f(x)$ and is the parameter of this linear regression model.
Multi-neurons
Multi-layer perceptron
# Load the training split from Google Drive; the first Excel column is the row index.
train_dataset = pd.read_excel(
    '/content/drive/MyDrive/linc/data_files/train_dataset.xlsx',
    index_col=0,
    engine='openpyxl',
)
train_dataset
train_dataset.describe()

# Separate the regression target ('Weight (g)') from the input features.
train_x = train_dataset.drop(columns=['Weight (g)'])
train_y = train_dataset['Weight (g)']
# Load the held-out test split and apply the same feature/target separation.
test_dataset = pd.read_excel(
    '/content/drive/MyDrive/linc/data_files/test_dataset.xlsx',
    index_col=0,
    engine='openpyxl',
)
test_x = test_dataset.drop(columns=['Weight (g)'])
test_y = test_dataset['Weight (g)']

print(f'train_x: {train_x.shape}, train_y: {train_y.shape}')
print(f'test_x: {test_x.shape}, test_y: {test_y.shape}')
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a single regression tree. max_depth=10000 is far deeper
# than a small dataset can reach, so growth is effectively unrestricted.
dt_model = DecisionTreeRegressor(max_depth=10000, random_state=42)
reg = dt_model.fit(train_x, train_y)
pred_dt = reg.predict(test_x)
def plot_(pred, test_y, name, size=True):
    """Plot model predictions against the ground-truth target values.

    Parameters
    ----------
    pred : array-like
        Predicted 'Weight (g)' values, drawn in index order (red dashed line).
    test_y : array-like
        Ground-truth target values; converted to a NumPy array so the
        pandas index does not shift the x-axis (blue dashed line).
    name : str
        Plot title.
    size : bool, default True
        When True, open a new 8x6 figure. Pass False to draw into an
        existing figure/subplot instead.
    """
    # Only open a fresh figure when requested, so this helper can also
    # render inside an existing subplot grid.
    if size:
        plt.figure(figsize=(8, 6))
    plt.plot(pred, 'ro--', label='Prediction')
    plt.plot(np.array(test_y), 'bo--', label='Ground Truth')
    plt.legend(fontsize=15)
    plt.ylabel('Weight (g)', fontsize=20)
    plt.title(name, fontsize=30)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
plot_(pred_dt, test_y, 'Decision Tree')

from sklearn.ensemble import RandomForestRegressor

# Ensemble of 100 trees; max_depth=10000 leaves each tree effectively
# unconstrained in depth, as in the decision-tree baseline above.
forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=10000,
    random_state=42,
)
reg = forest.fit(train_x, train_y)
pred_rf = reg.predict(test_x)
plot_(pred_rf, test_y, 'Random Forest')
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the full feature set.
reg = LinearRegression().fit(train_x, train_y)
print(f'Regression Coefficient: \n{reg.coef_}\n')
print(f'Regression Bias: \n{reg.intercept_}')

pred_lr = reg.predict(test_x)
pred_lr
plot_(pred_lr, test_y, 'Linear Regression')
Unlike the toy example (e.g., MNIST dataset), when applying artificial intelligence to industrial applications, a variety of issues may arise. The curse of dimensionality is an example of one of them. Because several physical phenomena overlap in the case of industrial data, it is difficult to know important features for predicted values from the standpoint of domain knowledge.
To ensure statistical stability, the number of features should be kept appropriate to the number of available data points.
Therefore, we try correlation analysis to select important features in data-based prediction.
# Build a triangular mask so each correlation pair appears only once
# (upper triangle True = hidden, lower triangle False = drawn).
# FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 — this
# line raised AttributeError on current NumPy; use the builtin bool.
mask = np.zeros_like(train_dataset.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(24, 20))
sns.heatmap(train_dataset.corr(),
            cmap = 'RdYlBu_r',
            annot = True,              # write the correlation value in each cell
            mask = mask,               # hide the masked (upper-triangle) cells
            linewidths = 1,            # separate cells with thin border lines
            cbar_kws = {"shrink": .5}, # shrink the colour bar to half height
            vmin = -1, vmax = 1        # fix the colour scale to the full [-1, 1] range
            )
plt.show()
# Correlation of every feature with the regression target.
weight_corr = train_dataset.corr()['Weight (g)']
weight_corr.sort_values(ascending = False)

# Candidate features to discard: absolute correlation with the target below 0.8.
del_features = weight_corr[weight_corr.abs() < 0.8]
del_features.keys()

# Drop all but the first five weakly-correlated columns.
# NOTE(review): the [5:] slice deliberately keeps five low-correlation
# features in the data — presumably intentional; confirm with the author.
drop_cols = del_features.index[5:]
train_x_fs = train_x.drop(columns=drop_cols)
test_x_fs = test_x.drop(columns=drop_cols)

print(f'train_x_fs: {train_x_fs.shape}, train_y: {train_y.shape}')
print(f'test_x_fs: {test_x_fs.shape}, test_y: {test_y.shape}')
from sklearn.linear_model import LinearRegression

# Refit ordinary least squares on the reduced (feature-selected) inputs.
reg = LinearRegression().fit(train_x_fs, train_y)
print(f'Regression Coefficient: \n{reg.coef_}\n')
print(f'Regression Bias: \n{reg.intercept_}')

pred_s_lr = reg.predict(test_x_fs)
plot_(pred_s_lr, test_y, 'Linear Regression with FS')
# Fix the TF RNG so weight initialisation (and hence training) is repeatable.
tf.random.set_seed(42)

# Five hidden ReLU layers of 10 units each, then one linear output unit.
dnn_layers = [tf.keras.layers.Input(shape=(train_x.shape[1],))]
dnn_layers += [tf.keras.layers.Dense(units=10, activation='relu') for _ in range(5)]
dnn_layers.append(tf.keras.layers.Dense(units=1, activation=None))
model = tf.keras.models.Sequential(dnn_layers)
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse')
loss = model.fit(train_x, train_y, epochs=50)

pred_dnn = model.predict(test_x)
plot_(pred_dnn, test_y, 'Deep Neural Network')
# Retrain the same 5x10 ReLU architecture on the feature-selected inputs,
# re-seeding so initialisation matches the previous run.
tf.random.set_seed(42)

fs_layers = [tf.keras.layers.Input(shape=(train_x_fs.shape[1],))]
fs_layers += [tf.keras.layers.Dense(units=10, activation='relu') for _ in range(5)]
fs_layers.append(tf.keras.layers.Dense(units=1, activation=None))
model = tf.keras.models.Sequential(fs_layers)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse')
loss = model.fit(train_x_fs, train_y, epochs=50)

pred_s_dnn = model.predict(test_x_fs)
plot_(pred_s_dnn, test_y, 'Deep Neural Network with FS')
# Draw all six model results side by side in a 3x2 grid. size=False stops
# plot_ from opening a new figure for each panel.
plt.figure(figsize=(15, 20))
panels = [
    (pred_dt, 'Decision Tree'),
    (pred_rf, 'Random Forest'),
    (pred_lr, 'Linear Regression'),
    (pred_s_lr, 'Linear Regression with FS'),
    (pred_dnn, 'Deep Neural Network'),
    (pred_s_dnn, 'Deep Neural Network with FS'),
]
for idx, (pred, title) in enumerate(panels, start=1):
    plt.subplot(3, 2, idx)
    plot_(pred, test_y, title, False)
%%javascript
// Inject a table-of-contents generator script into the notebook front-end.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')